
/***************************************************************************
                                                                           *
Copyright 2013 CertiVox UK Ltd.                                           *
                                                                           *
This file is part of CertiVox MIRACL Crypto SDK.                           *
                                                                           *
The CertiVox MIRACL Crypto SDK provides developers with an                 *
extensive and efficient set of cryptographic functions.                    *
For further information about its features and functionalities please      *
refer to http://www.certivox.com                                           *
                                                                           *
* The CertiVox MIRACL Crypto SDK is free software: you can                 *
  redistribute it and/or modify it under the terms of the                  *
  GNU Affero General Public License as published by the                    *
  Free Software Foundation, either version 3 of the License,               *
  or (at your option) any later version.                                   *
                                                                           *
* The CertiVox MIRACL Crypto SDK is distributed in the hope                *
  that it will be useful, but WITHOUT ANY WARRANTY; without even the       *
  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
  See the GNU Affero General Public License for more details.              *
                                                                           *
* You should have received a copy of the GNU Affero General Public         *
  License along with CertiVox MIRACL Crypto SDK.                           *
  If not, see <http://www.gnu.org/licenses/>.                              *
                                                                           *
You can be released from the requirements of the license by purchasing     *
a commercial license. Buying such a license is mandatory as soon as you    *
develop commercial activities involving the CertiVox MIRACL Crypto SDK     *
without disclosing the source code of your own applications, or shipping   *
the CertiVox MIRACL Crypto SDK with a closed source product.               *
                                                                           *
***************************************************************************/
/*
 *   Many processors support a floating-point coprocessor, which may
 *   implement a faster multiplication instruction than the corresponding
 *   integer instruction. This is the case for the Pentium processor
 *   which has a built-in co-processor. This can be exploited to give even
 *   faster performance.
 *
 *   Note that since the partial products are accumulated in a 64-bit register
 *   this implies that a full-width number base (2^32) cannot be used.
 *   The maximum number base that can be used is 2^x where x is
 *   calculated such that 2^(64-2*x) > 2*WORDS_IN_MODULUS. This means that
 *   x will usually be 28 or 29
 *
 *   To use this code:-
 *
 *   (1) Implemented and tested only for the Pentium processor and
 *       using the Borland C compiler (BCC and BCC32)
 *
 *   (2) Define MR_PENTIUM in mirdef.h. Determine the maximum modulus to be
 *       used, and from that determine the value of WORDS_IN_MODULUS.
 *
 *   (3) Use as a number base the value of x calculated as shown above.
 *       For example, for 512 bit exponentiation, WORDS_IN_MODULUS will be 18
 *       so call mirsys(50,536870912L) in your main program.
 *       (Observe that 536870912 = 2^29, and that 18*29 = 522, big enough
 *       for 512 bit calculations).
 *
 *   (4) Use Montgomery representation when implementing your crypto-system
 *       e.g. use monty_powmod(). This will automatically call the
 *       routines in this module.
 *
 *   Note that it is *VITAL* that double arrays be aligned on 8-byte
 *   boundaries for the Pentium. The Borland C compiler does *not* do this
 *   automatically!!!!
 *
 *   Many thanks are due to Paul Rubin, who suggested to me that this approach
 *   might be faster than the all-integer methods described elsewhere.
 *
 *   Further speed increases can be acheived by loop-unrolling. Completely
 *   unrolled code (a la Comba) has been experimented with, and gives a
 *   25% speed-up in some cases. Note that the basic code for a single partial
 *   product takes only 3 cycles.
 *
 *          fld ...       ;1 cycle
 *          fmul ...      ;1 cycle
 *          fxch st(2)    ;0 cycle
 *          fadd          ;1 cycle
 *
 *   Compare this with the integer "mul" instruction which takes 10 cycles
 *   on a Pentium
 *
 *   Interestingly the fmul is faster than the fimul. So paradoxically it is
 *   quicker to manipulate 64-bit doubles than it is to manipulate 32-bit
 *   integers. Clearly the Pentium FP processor has been optimised for real
 *   arithmetic. However this requires us to convert all bigs from integer
 *   arrays to double arrays (see mrmonty.c) which is very wasteful of space
 *   and rather awkward.
 *
 *
 *   The FP stack is primed in prepare_monty() :-
 *   magic  - (2^63+2^62)*base. By adding and then subtracting this number we
 *            get the top half of the sum.
 *   1/base - Inverse of the number base
 *   ndash  - Montgomery's constant
 *
 */

#include "miracl.h"

#ifdef MR_PENTIUM
#define N       8
#define POINTER QWORD PTR

#if INLINE_ASM == 1
#define PAX ax
#define PBP bp
#define PBX bx
#define PSI si
#define PDI di
#define PCX cx
#define PDX dx
#endif

#if INLINE_ASM == 2
#define PAX ax
#define PBP bp
#define PBX bx
#define PSI si
#define PDI di
#define PCX cx
#define PDX dx
#endif

#if INLINE_ASM == 3
#define PAX eax
#define PBP ebp
#define PBX ebx
#define PSI esi
#define PDI edi
#define PCX ecx
#define PDX edx
#endif

#ifdef INLINE_ASM
#ifndef MR_LMM
/* not implemented for large memory model 16 bit */

void fastmodmult(_MIPD_ big x, big y, big z)
{
    int ij, rn, nrn;
#ifdef MR_OS_THREADS
    miracl* mr_mip = get_mip();
#endif
    big       modulus = mr_mip->modulus;
    big       w0      = mr_mip->w0;
    mr_small *wg, *mg, *xg, *yg;
    wg = w0->w;
    mg = modulus->w;
    xg = x->w;
    yg = y->w;
    rn = (int)modulus->len;
    for (ij = 2 * rn; ij < (int)(w0->len & MR_OBITS); ij++) w0->w[ij] = 0.0;
    w0->len = 2 * rn;
    nrn     = N * rn;

    ASM push PBP ASM push PDI ASM push PSI

        ASM mov    PBX,
        xg ASM mov PSI, yg ASM mov PDX, mg ASM mov PDI, wg ASM mov PAX,
        nrn

            ASM mov PBP,
        N

                ASM fldz

                    ASM xor
            PCX,
        PCX m1 : ASM push PCX ASM add PSI,
                 PCX ASM fld POINTER[PBX] ASM fmul POINTER[PSI] ASM add PBX,
                 PBP ASM sub                                            PSI,
                 PBP ASM test                                           PCX,
                 PCX ASM jz m3                                          m2
        : ASM fld POINTER[PBX] ASM fmul POINTER[PSI] ASM fxch st(2) ASM add PBX,
          PBP ASM sub                                                       PSI,
          PBP ASM fadd ASM sub                                              PCX,
          PBP ASM jnz m2                                                    m3
        : ASM sub                                                           PBX,
          PBP ASM add                                                       PSI,
          PBP ASM fadd ASM pop PCX ASM sub                                  PBX,
          PCX /* restore PBX */

              ASM xchg PSI,
          PDX /* PSI -> modulus */
              ASM push PCX ASM test PCX,
          PCX ASM jz m6 ASM add     PSI,
          PCX ASM fld POINTER[PDI] ASM fmul POINTER[PSI] ASM add PDI,
          PBP ASM sub                                            PSI,
          PBP ASM sub                                            PCX,
          PBP ASM jz m5 m4 : /* this is typical of the critical inner loop */
                             ASM fld  POINTER[PDI] /* 1 cycle  */
                             ASM fmul POINTER[PSI] /* 1 cycle  */
                             ASM fxch st(2)        /* 0 cycle  */
                             ASM add  PDI,
                             PBP /* 1 cycle  */
                                 ASM sub PSI,
                             PBP          /* 0 cycle  */
                                 ASM fadd /* 1 cycle  */
                                     ASM sub PCX,
                             PBP            /* 1 cycle  */
                                 ASM jnz m4 /* 0 cycle  */
                                            /* total = 5 cycles */
                                     m5 : ASM fadd m6 :

        ASM fld st(0) ASM fadd st,
        st(2) ASM fsub         st,
        st(2) ASM fsubr        st,
        st(1) ASM fmul         st,
        st(4) ASM fld st(0) ASM fadd st,
        st(3) ASM fsub               st,
        st(3) ASM fsub ASM fst POINTER[PDI] ASM fmul POINTER[PSI] ASM fadd ASM
            fmul                                     st,
        st(2)

            ASM xchg            PSI,
        PDX ASM pop PCX ASM sub PDI,
        PCX /* restore PDI */

            ASM add PCX,
        PBP /* increment PCX */
            ASM cmp PCX,
        PAX ASM jl m1

            ASM sub PCX,
        PBP /* PCX=12 */
            ASM add PSI,
        PCX ASM add PBX,
        PCX /* PBX -> x[4] */
            ASM add PDX,
        PCX ASM add PDI,
        PCX ASM sub PCX,
        PBP /* going back down again PCX=8 */
            m7 : ASM push PCX ASM sub PBX,
                 PCX ASM fld POINTER[PBX] ASM fmul POINTER[PSI] ASM add PBX,
                 PBP ASM sub                                            PSI,
                 PBP ASM test                                           PCX,
                 PCX ASM jz m9                                          m8
        : ASM fld POINTER[PBX] ASM fmul POINTER[PSI] ASM fxch st(2) ASM add PBX,
          PBP ASM sub                                                       PSI,
          PBP ASM fadd ASM sub                                              PCX,
          PBP ASM jnz m8 m9 : ASM sub                          PBX,
                              PBP ASM add                      PSI,
                              PBP ASM fadd ASM pop PCX ASM add PSI,
                              PCX /* restore PSI */

                                  ASM sub  PDI,
                              PCX ASM xchg PSI,
                              PDX /* PSI -> modulus */
                                  ASM push PCX ASM fld POINTER[PDI] ASM fmul
                                      POINTER[PSI] ASM add PDI,
                              PBP ASM sub                  PSI,
                              PBP ASM test                 PCX,
                              PCX ASM jz m11               m10
        : ASM fld POINTER[PDI] ASM fmul POINTER[PSI] ASM fxch st(2) ASM add PDI,
          PBP ASM sub                                                       PSI,
          PBP ASM fadd ASM sub                                              PCX,
          PBP ASM jnz m10                                                   m11
        : ASM sub                                                           PDI,
          PBP ASM add                                                       PSI,
          PBP ASM fadd ASM pop PCX ASM add                                  PSI,
          PCX /* restore PSI */
              ASM xchg PSI,
          PDX

              ASM push PDI ASM add PDI,
          PAX ASM sub              PDI,
          PCX ASM sub              PDI,
          PBP

              ASM fld st(0) ASM fadd st,
          st(2) ASM fsub             st,
          st(2) ASM fst st(5) ASM fmul st,
          st(3) ASM fxch st(5) ASM fsub ASM fstp POINTER[PDI] ASM fld st(3)

              ASM pop PDI

                  ASM sub PCX,
          PBP ASM jge m7

              ASM add PDI,
          PAX

              ASM fld st(0) ASM fadd st,
          st(2) ASM fsub             st,
          st(2) ASM fst st(5) ASM fmul st,
          st(3) ASM fxch st(5) ASM fsub ASM fstp POINTER[PDI] ASM fld st(3)

              ASM add  PDI,
          PBP ASM fstp POINTER[PDI]

          ASM pop PSI ASM pop PDI ASM pop PBP

          for (ij = rn; ij < (int)(z->len & MR_OBITS); ij++) z->w[ij] = 0.0;
    z->len                                                            = rn;
    for (ij = 0; ij < rn; ij++) z->w[ij] = w0->w[ij + rn];
    if (z->w[rn - 1] == 0.0) mr_lzero(z);
}

void fastmodsquare(_MIPD_ big x, big z)
{
    int ij, rn, nrn;
#ifdef MR_OS_THREADS
    miracl* mr_mip = get_mip();
#endif
    big       modulus = mr_mip->modulus;
    big       w0      = mr_mip->w0;
    mr_small *wg, *mg, *xg;
    wg = w0->w;
    mg = modulus->w;
    xg = x->w;

    rn = (int)modulus->len;
    for (ij = 2 * rn; ij < (int)(w0->len & MR_OBITS); ij++) w0->w[ij] = 0.0;
    w0->len = 2 * rn;
    nrn     = N * rn;

    ASM push PBP ASM push PDI ASM push PSI

        ASM mov    PBX,
        xg ASM mov PSI, xg ASM mov PDX, mg ASM mov PDI, wg ASM mov PAX,
        nrn

            ASM mov PBP,
        N

                ASM fldz

                    ASM xor
            PCX,
        PCX                                  s1
        : ASM push PBX ASM push PSI ASM test PCX,
          PCX ASM jz s4 ASM add              PSI,
          PCX ASM fstp                       st(5)
              ASM fldz ASM fld POINTER[PBX] ASM fmul POINTER[PSI] ASM sub PSI,
          PBP ASM add                                                     PBX,
          PBP ASM cmp                                                     PSI,
          PBX ASM jle s3                                                  s2
        : ASM fld POINTER[PBX] ASM fmul POINTER[PSI] ASM fxch st(2) ASM sub PSI,
          PBP ASM add                                                       PBX,
          PBP ASM fadd ASM cmp                                              PSI,
          PBX ASM jg s2 s3 : ASM fadd ASM fld st(0) ASM fadd ASM fadd st,
                             st(5) s4
        : ASM cmp                PSI,
          PBX ASM jne s5 ASM fld POINTER[PBX] ASM fmul st,
          st(0) ASM fadd                               s5
        : ASM pop PSI ASM pop PBX /* restore pointers */

              ASM xchg PSI,
          PDX /* PSI -> modulus */
              ASM push PCX ASM test PCX,
          PCX ASM jz s8 ASM add     PSI,
          PCX ASM fld POINTER[PDI] ASM fmul POINTER[PSI] ASM add PDI,
          PBP ASM sub                                            PSI,
          PBP ASM sub                                            PCX,
          PBP ASM jz s7                                          s6
        : ASM fld POINTER[PDI] ASM fmul POINTER[PSI] ASM fxch st(2) ASM add PDI,
          PBP ASM sub                                                       PSI,
          PBP ASM fadd ASM sub                                              PCX,
          PBP ASM jnz s6 s7 : ASM fadd s8 :

        ASM fld st(0) ASM fadd st,
        st(2) ASM fsub         st,
        st(2) ASM fsubr        st,
        st(1) ASM fmul         st,
        st(4) ASM fld st(0) ASM fadd st,
        st(3) ASM fsub               st,
        st(3) ASM fsub ASM fst POINTER[PDI] ASM fmul POINTER[PSI] ASM fadd ASM
            fmul                                     st,
        st(2)

            ASM xchg            PSI,
        PDX ASM pop PCX ASM sub PDI,
        PCX /* restore PDI */

            ASM add PCX,
        PBP /* increment PCX */
            ASM cmp PCX,
        PAX ASM jl s1

            ASM sub PCX,
        PBP /* PCX=12 */
            ASM add PSI,
        PCX ASM add PBX,
        PCX /* PBX -> x[4] */
            ASM add PDX,
        PCX ASM add PDI,
        PCX ASM sub PCX,
        PBP /* going back down again PCX=8 */
            s9 : ASM push PBX ASM push PSI ASM test PCX,
                 PCX ASM jz s13                     s10
        : ASM sub                                   PBX,
          PCX ASM fstp                              st(5)
              ASM fldz ASM fld POINTER[PBX] ASM fmul POINTER[PSI] ASM sub PSI,
          PBP ASM add                                                     PBX,
          PBP ASM cmp                                                     PSI,
          PBX ASM jle s12                                                 s11
        : ASM fld POINTER[PBX] ASM fmul POINTER[PSI] ASM fxch st(2) ASM sub PSI,
          PBP ASM add                                                       PBX,
          PBP ASM fadd ASM cmp                                              PSI,
          PBX ASM jg s11 s12 : ASM fadd ASM fld st(0) ASM fadd ASM fadd st,
                               st(5) s13
        : ASM cmp                 PSI,
          PBX ASM jne s14 ASM fld POINTER[PBX] ASM fmul st,
          st(0) ASM fadd s14 : ASM pop PSI ASM pop PBX

                                   ASM sub  PDI,
                               PCX ASM xchg PSI,
                               PDX /* PSI -> modulus */
                                   ASM push PCX ASM fld POINTER[PDI] ASM fmul
                                       POINTER[PSI] ASM add PDI,
                               PBP ASM sub                  PSI,
                               PBP ASM test                 PCX,
                               PCX ASM jz s16               s15
        : ASM fld POINTER[PDI] ASM fmul POINTER[PSI] ASM fxch st(2) ASM add PDI,
          PBP ASM sub                                                       PSI,
          PBP ASM fadd ASM sub                                              PCX,
          PBP ASM jnz s15                                                   s16
        : ASM sub                                                           PDI,
          PBP ASM add                                                       PSI,
          PBP ASM fadd ASM pop PCX ASM add                                  PSI,
          PCX /* restore PSI */
              ASM xchg PSI,
          PDX

              ASM push PDI ASM add PDI,
          PAX ASM sub              PDI,
          PCX ASM sub              PDI,
          PBP

              ASM fld st(0) ASM fadd st,
          st(2) ASM fsub             st,
          st(2) ASM fst st(5) ASM fmul st,
          st(3) ASM fxch st(5) ASM fsub ASM fstp POINTER[PDI] ASM fld st(3)

              ASM pop PDI

                  ASM sub PCX,
          PBP ASM jge s9

              ASM add PDI,
          PAX

              ASM fld st(0) ASM fadd st,
          st(2) ASM fsub             st,
          st(2) ASM fst st(5) ASM fmul st,
          st(3) ASM fxch st(5) ASM fsub ASM fstp POINTER[PDI] ASM fld st(3)

              ASM add  PDI,
          PBP ASM fstp POINTER[PDI]

          ASM pop PSI ASM pop PDI ASM pop PBP

          for (ij = rn; ij < (int)(z->len & MR_OBITS); ij++) z->w[ij] = 0.0;
    z->len                                                            = rn;
    for (ij = 0; ij < rn; ij++) z->w[ij] = w0->w[ij + rn];
    if (z->w[rn - 1] == 0.0) mr_lzero(z);
}

#endif
#endif
#endif
